library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(dplyr)
library(ggplot2)
library(tidyr)
library(janitor)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(skimr)
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
# Load the three CSV exports used in this analysis: daily activity,
# per-day sleep records, and hourly step counts.
daily_activity <- read.csv(
  file = "Capstone Project/Fitabase Data 4.12.16-5.12.16/dailyActivity_merged.csv"
)
sleep_day <- read.csv(
  file = "Capstone Project/Fitabase Data 4.12.16-5.12.16/sleepDay_merged.csv"
)
hourly_steps <- read.csv(
  file = "Capstone Project/Fitabase Data 4.12.16-5.12.16/hourlySteps_merged.csv"
)
# Preview the first six rows of the daily activity table.
head(daily_activity, n = 6L)
## Id ActivityDate TotalSteps TotalDistance TrackerDistance
## 1 1503960366 4/12/2016 13162 8.50 8.50
## 2 1503960366 4/13/2016 10735 6.97 6.97
## 3 1503960366 4/14/2016 10460 6.74 6.74
## 4 1503960366 4/15/2016 9762 6.28 6.28
## 5 1503960366 4/16/2016 12669 8.16 8.16
## 6 1503960366 4/17/2016 9705 6.48 6.48
## LoggedActivitiesDistance VeryActiveDistance ModeratelyActiveDistance
## 1 0 1.88 0.55
## 2 0 1.57 0.69
## 3 0 2.44 0.40
## 4 0 2.14 1.26
## 5 0 2.71 0.41
## 6 0 3.19 0.78
## LightActiveDistance SedentaryActiveDistance VeryActiveMinutes
## 1 6.06 0 25
## 2 4.71 0 21
## 3 3.91 0 30
## 4 2.83 0 29
## 5 5.04 0 36
## 6 2.51 0 38
## FairlyActiveMinutes LightlyActiveMinutes SedentaryMinutes Calories
## 1 13 328 728 1985
## 2 19 217 776 1797
## 3 11 181 1218 1776
## 4 34 209 726 1745
## 5 10 221 773 1863
## 6 20 164 539 1728
# List the column names of the daily activity table.
names(daily_activity)
## [1] "Id" "ActivityDate"
## [3] "TotalSteps" "TotalDistance"
## [5] "TrackerDistance" "LoggedActivitiesDistance"
## [7] "VeryActiveDistance" "ModeratelyActiveDistance"
## [9] "LightActiveDistance" "SedentaryActiveDistance"
## [11] "VeryActiveMinutes" "FairlyActiveMinutes"
## [13] "LightlyActiveMinutes" "SedentaryMinutes"
## [15] "Calories"
# Show the column types of the daily activity table.
str(object = daily_activity)
## 'data.frame': 940 obs. of 15 variables:
## $ Id : num 1.5e+09 1.5e+09 1.5e+09 1.5e+09 1.5e+09 ...
## $ ActivityDate : chr "4/12/2016" "4/13/2016" "4/14/2016" "4/15/2016" ...
## $ TotalSteps : int 13162 10735 10460 9762 12669 9705 13019 15506 10544 9819 ...
## $ TotalDistance : num 8.5 6.97 6.74 6.28 8.16 ...
## $ TrackerDistance : num 8.5 6.97 6.74 6.28 8.16 ...
## $ LoggedActivitiesDistance: num 0 0 0 0 0 0 0 0 0 0 ...
## $ VeryActiveDistance : num 1.88 1.57 2.44 2.14 2.71 ...
## $ ModeratelyActiveDistance: num 0.55 0.69 0.4 1.26 0.41 ...
## $ LightActiveDistance : num 6.06 4.71 3.91 2.83 5.04 ...
## $ SedentaryActiveDistance : num 0 0 0 0 0 0 0 0 0 0 ...
## $ VeryActiveMinutes : int 25 21 30 29 36 38 42 50 28 19 ...
## $ FairlyActiveMinutes : int 13 19 11 34 10 20 16 31 12 8 ...
## $ LightlyActiveMinutes : int 328 217 181 209 221 164 233 264 205 211 ...
## $ SedentaryMinutes : int 728 776 1218 726 773 539 1149 775 818 838 ...
## $ Calories : int 1985 1797 1776 1745 1863 1728 1921 2035 1786 1775 ...
# Preview the first six rows of the sleep table.
head(sleep_day, n = 6L)
## Id SleepDay TotalSleepRecords TotalMinutesAsleep
## 1 1503960366 4/12/2016 12:00:00 AM 1 327
## 2 1503960366 4/13/2016 12:00:00 AM 2 384
## 3 1503960366 4/15/2016 12:00:00 AM 1 412
## 4 1503960366 4/16/2016 12:00:00 AM 2 340
## 5 1503960366 4/17/2016 12:00:00 AM 1 700
## 6 1503960366 4/19/2016 12:00:00 AM 1 304
## TotalTimeInBed
## 1 346
## 2 407
## 3 442
## 4 367
## 5 712
## 6 320
# List the column names of the sleep table.
names(sleep_day)
## [1] "Id" "SleepDay" "TotalSleepRecords"
## [4] "TotalMinutesAsleep" "TotalTimeInBed"
# Show the column types of the sleep table.
str(object = sleep_day)
## 'data.frame': 413 obs. of 5 variables:
## $ Id : num 1.5e+09 1.5e+09 1.5e+09 1.5e+09 1.5e+09 ...
## $ SleepDay : chr "4/12/2016 12:00:00 AM" "4/13/2016 12:00:00 AM" "4/15/2016 12:00:00 AM" "4/16/2016 12:00:00 AM" ...
## $ TotalSleepRecords : int 1 2 1 2 1 1 1 1 1 1 ...
## $ TotalMinutesAsleep: int 327 384 412 340 700 304 360 325 361 430 ...
## $ TotalTimeInBed : int 346 407 442 367 712 320 377 364 384 449 ...
# Preview the first six rows of the hourly steps table.
head(hourly_steps, n = 6L)
## Id ActivityHour StepTotal
## 1 1503960366 4/12/2016 12:00:00 AM 373
## 2 1503960366 4/12/2016 1:00:00 AM 160
## 3 1503960366 4/12/2016 2:00:00 AM 151
## 4 1503960366 4/12/2016 3:00:00 AM 0
## 5 1503960366 4/12/2016 4:00:00 AM 0
## 6 1503960366 4/12/2016 5:00:00 AM 0
# List the column names of the hourly steps table.
names(hourly_steps)
## [1] "Id" "ActivityHour" "StepTotal"
# Show the column types of the hourly steps table.
str(object = hourly_steps)
## 'data.frame': 22099 obs. of 3 variables:
## $ Id : num 1.5e+09 1.5e+09 1.5e+09 1.5e+09 1.5e+09 ...
## $ ActivityHour: chr "4/12/2016 12:00:00 AM" "4/12/2016 1:00:00 AM" "4/12/2016 2:00:00 AM" "4/12/2016 3:00:00 AM" ...
## $ StepTotal : int 373 160 151 0 0 0 0 0 250 1864 ...
## Understanding Summary Statistics
# Number of distinct participant Ids in each data set
n_distinct(daily_activity[["Id"]])
## [1] 33
n_distinct(sleep_day[["Id"]])
## [1] 24
n_distinct(hourly_steps[["Id"]])
## [1] 33
# Number of fully duplicated rows in each data set
daily_activity %>% duplicated() %>% sum()
## [1] 0
sleep_day %>% duplicated() %>% sum()
## [1] 3
hourly_steps %>% duplicated() %>% sum()
## [1] 0
# Keep only distinct rows in each table, then drop rows with missing values.
daily_activity <- drop_na(distinct(daily_activity))
sleep_day <- drop_na(distinct(sleep_day))
hourly_steps <- drop_na(distinct(hourly_steps))
# Confirm that sleep_day no longer contains duplicated rows.
sleep_day %>% duplicated() %>% sum()
## [1] 0
# Standardise the date columns: rename each one to `date` and parse the
# month/day/year strings into Date values so the two tables share a join key.
daily_activity <- daily_activity %>%
  rename(date = ActivityDate) %>%
  mutate(date = as.Date(date, format = "%m/%d/%Y"))
# SleepDay strings carry a time-of-day suffix (e.g. "4/12/2016 12:00:00 AM"),
# so the full format is supplied and as.Date() keeps only the calendar date.
# No `tz` is passed: as.Date() on character input does not use it, and
# passing Sys.timezone() wrongly suggested the result was machine-dependent.
sleep_day <- sleep_day %>%
  rename(date = SleepDay) %>%
  mutate(date = as.Date(date, format = "%m/%d/%Y %I:%M:%S %p"))
# Check that the date column of daily_activity is now a Date.
str(object = daily_activity)
## 'data.frame': 940 obs. of 15 variables:
## $ Id : num 1.5e+09 1.5e+09 1.5e+09 1.5e+09 1.5e+09 ...
## $ date : Date, format: "2016-04-12" "2016-04-13" ...
## $ TotalSteps : int 13162 10735 10460 9762 12669 9705 13019 15506 10544 9819 ...
## $ TotalDistance : num 8.5 6.97 6.74 6.28 8.16 ...
## $ TrackerDistance : num 8.5 6.97 6.74 6.28 8.16 ...
## $ LoggedActivitiesDistance: num 0 0 0 0 0 0 0 0 0 0 ...
## $ VeryActiveDistance : num 1.88 1.57 2.44 2.14 2.71 ...
## $ ModeratelyActiveDistance: num 0.55 0.69 0.4 1.26 0.41 ...
## $ LightActiveDistance : num 6.06 4.71 3.91 2.83 5.04 ...
## $ SedentaryActiveDistance : num 0 0 0 0 0 0 0 0 0 0 ...
## $ VeryActiveMinutes : int 25 21 30 29 36 38 42 50 28 19 ...
## $ FairlyActiveMinutes : int 13 19 11 34 10 20 16 31 12 8 ...
## $ LightlyActiveMinutes : int 328 217 181 209 221 164 233 264 205 211 ...
## $ SedentaryMinutes : int 728 776 1218 726 773 539 1149 775 818 838 ...
## $ Calories : int 1985 1797 1776 1745 1863 1728 1921 2035 1786 1775 ...
# Check that the date column of sleep_day is now a Date.
str(object = sleep_day)
## 'data.frame': 410 obs. of 5 variables:
## $ Id : num 1.5e+09 1.5e+09 1.5e+09 1.5e+09 1.5e+09 ...
## $ date : Date, format: "2016-04-12" "2016-04-13" ...
## $ TotalSleepRecords : int 1 2 1 2 1 1 1 1 1 1 ...
## $ TotalMinutesAsleep: int 327 384 412 340 700 304 360 325 361 430 ...
## $ TotalTimeInBed : int 346 407 442 367 712 320 377 364 384 449 ...
# Rename ActivityHour to date_time and parse its 12-hour-clock strings into
# POSIXct values in the system time zone.
hourly_steps <- rename(hourly_steps, date_time = ActivityHour)
hourly_steps$date_time <- as.POSIXct(
  hourly_steps$date_time,
  format = "%m/%d/%Y %I:%M:%S %p",
  tz = Sys.timezone()
)
head(hourly_steps, n = 6L)
## Id date_time StepTotal
## 1 1503960366 2016-04-12 00:00:00 373
## 2 1503960366 2016-04-12 01:00:00 160
## 3 1503960366 2016-04-12 02:00:00 151
## 4 1503960366 2016-04-12 03:00:00 0
## 5 1503960366 2016-04-12 04:00:00 0
## 6 1503960366 2016-04-12 05:00:00 0
Pulling statistics for analysis
count (number of rows), mean (average), std (standard deviation), min and max, and the 25%, 50% and 75% percentiles
# Summary statistics for the headline daily measures: steps, distance,
# sedentary minutes and calories.
summary(
  select(daily_activity, TotalSteps, TotalDistance, SedentaryMinutes, Calories)
)
## TotalSteps TotalDistance SedentaryMinutes Calories
## Min. : 0 Min. : 0.000 Min. : 0.0 Min. : 0
## 1st Qu.: 3790 1st Qu.: 2.620 1st Qu.: 729.8 1st Qu.:1828
## Median : 7406 Median : 5.245 Median :1057.5 Median :2134
## Mean : 7638 Mean : 5.490 Mean : 991.2 Mean :2304
## 3rd Qu.:10727 3rd Qu.: 7.713 3rd Qu.:1229.5 3rd Qu.:2793
## Max. :36019 Max. :28.030 Max. :1440.0 Max. :4900
# Summary statistics for the minutes spent at each activity level
summary(
  select(
    daily_activity,
    VeryActiveMinutes,
    FairlyActiveMinutes,
    LightlyActiveMinutes,
    SedentaryMinutes
  )
)
## VeryActiveMinutes FairlyActiveMinutes LightlyActiveMinutes SedentaryMinutes
## Min. : 0.00 Min. : 0.00 Min. : 0.0 Min. : 0.0
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.:127.0 1st Qu.: 729.8
## Median : 4.00 Median : 6.00 Median :199.0 Median :1057.5
## Mean : 21.16 Mean : 13.56 Mean :192.8 Mean : 991.2
## 3rd Qu.: 32.00 3rd Qu.: 19.00 3rd Qu.:264.0 3rd Qu.:1229.5
## Max. :210.00 Max. :143.00 Max. :518.0 Max. :1440.0
# Summary statistics for the sleep measures
summary(
  select(sleep_day, TotalSleepRecords, TotalMinutesAsleep, TotalTimeInBed)
)
## TotalSleepRecords TotalMinutesAsleep TotalTimeInBed
## Min. :1.00 Min. : 58.0 Min. : 61.0
## 1st Qu.:1.00 1st Qu.:361.0 1st Qu.:403.8
## Median :1.00 Median :432.5 Median :463.0
## Mean :1.12 Mean :419.2 Mean :458.5
## 3rd Qu.:1.00 3rd Qu.:490.0 3rd Qu.:526.0
## Max. :3.00 Max. :796.0 Max. :961.0
# Summary statistics for the hourly step totals
summary(select(hourly_steps, StepTotal))
## StepTotal
## Min. : 0.0
## 1st Qu.: 0.0
## Median : 40.0
## Mean : 320.2
## 3rd Qu.: 357.0
## Max. :10554.0
# Join daily activity with sleep records on participant Id and date, then
# label each row with its weekday name. `date` is already a Date in both
# tables, so weekdays() is applied to it directly. The earlier
# as.Date(date, "m/%d/%Y") wrapper carried a malformed format string
# (missing `%` before `m`) and performed no parsing at all.
combined_data <- merge(daily_activity, sleep_day, by = c("Id", "date")) %>%
  mutate(weekday = weekdays(date))
# Number of participants that appear in both tables.
n_distinct(combined_data$Id)
## [1] 24
glimpse(combined_data)
## Rows: 410
## Columns: 19
## $ Id <dbl> 1503960366, 1503960366, 1503960366, 150396036…
## $ date <date> 2016-04-12, 2016-04-13, 2016-04-15, 2016-04-…
## $ TotalSteps <int> 13162, 10735, 9762, 12669, 9705, 15506, 10544…
## $ TotalDistance <dbl> 8.50, 6.97, 6.28, 8.16, 6.48, 9.88, 6.68, 6.3…
## $ TrackerDistance <dbl> 8.50, 6.97, 6.28, 8.16, 6.48, 9.88, 6.68, 6.3…
## $ LoggedActivitiesDistance <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ VeryActiveDistance <dbl> 1.88, 1.57, 2.14, 2.71, 3.19, 3.53, 1.96, 1.3…
## $ ModeratelyActiveDistance <dbl> 0.55, 0.69, 1.26, 0.41, 0.78, 1.32, 0.48, 0.3…
## $ LightActiveDistance <dbl> 6.06, 4.71, 2.83, 5.04, 2.51, 5.03, 4.24, 4.6…
## $ SedentaryActiveDistance <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ VeryActiveMinutes <int> 25, 21, 29, 36, 38, 50, 28, 19, 41, 39, 73, 3…
## $ FairlyActiveMinutes <int> 13, 19, 34, 10, 20, 31, 12, 8, 21, 5, 14, 23,…
## $ LightlyActiveMinutes <int> 328, 217, 209, 221, 164, 264, 205, 211, 262, …
## $ SedentaryMinutes <int> 728, 776, 726, 773, 539, 775, 818, 838, 732, …
## $ Calories <int> 1985, 1797, 1745, 1863, 1728, 2035, 1786, 177…
## $ TotalSleepRecords <int> 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ TotalMinutesAsleep <int> 327, 384, 412, 340, 700, 304, 360, 325, 361, …
## $ TotalTimeInBed <int> 346, 407, 442, 367, 712, 320, 377, 364, 384, …
## $ weekday <chr> "Tuesday", "Wednesday", "Friday", "Saturday",…
# Active users: per participant, the number of days on which either
# activity threshold is met.
# NOTE(review): 21.4 and 10.7 look like 150 and 75 minutes per week divided
# by 7 -- confirm the intended source of these cut-offs.
active_users <- daily_activity %>%
  filter(VeryActiveMinutes >= 10.7 | FairlyActiveMinutes >= 21.4) %>%
  group_by(Id) %>%
  count(Id)
# Total tracked minutes across all four activity levels.
total_minutes <- with(
  daily_activity,
  sum(SedentaryMinutes, VeryActiveMinutes, FairlyActiveMinutes, LightlyActiveMinutes)
)
# Share of the total spent at each level, in percent.
sedentary_percentage <- sum(daily_activity[["SedentaryMinutes"]]) / total_minutes * 100
lightly_percentage <- sum(daily_activity[["LightlyActiveMinutes"]]) / total_minutes * 100
fairly_percentage <- sum(daily_activity[["FairlyActiveMinutes"]]) / total_minutes * 100
active_percentage <- sum(daily_activity[["VeryActiveMinutes"]]) / total_minutes * 100
# Pie chart of the share of tracked minutes at each activity level.
# The `minutes` column holds the four *_percentage values, not raw minutes.
percentage <- data.frame(
  level = c("Sedentary", "Lightly Active", "Fairly Active", "Very Active"),
  minutes = c(
    sedentary_percentage,
    lightly_percentage,
    fairly_percentage,
    active_percentage
  )
)
plot_ly(
  percentage,
  labels = ~level,
  values = ~minutes,
  type = "pie",
  textposition = "outside",
  textinfo = "label+percent"
) %>%
  layout(
    title = "Activity Level Minutes",
    xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
    yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE)
  )
# Mean steps and mean minutes asleep for each day of the week. The weekday
# is stored as an ordered factor so results run Monday through Sunday.
day_steps_sleep <- combined_data %>%
  mutate(
    weekday = factor(
      weekdays(date),
      levels = c(
        "Monday", "Tuesday", "Wednesday", "Thursday",
        "Friday", "Saturday", "Sunday"
      ),
      ordered = TRUE
    )
  ) %>%
  group_by(weekday) %>%
  summarise(
    daily_steps = mean(TotalSteps),
    daily_sleep = mean(TotalMinutesAsleep)
  )
head(day_steps_sleep, n = 6L)
## # A tibble: 6 × 3
## weekday daily_steps daily_sleep
## <ord> <dbl> <dbl>
## 1 Monday 9273. 420.
## 2 Tuesday 9183. 405.
## 3 Wednesday 8023. 435.
## 4 Thursday 8184. 401.
## 5 Friday 7901. 405.
## 6 Saturday 9871. 419.
# Bar chart of mean daily steps per weekday, with a horizontal reference
# line at 7,500 steps.
ggplot(day_steps_sleep, aes(x = weekday, y = daily_steps)) +
  geom_col(fill = "#006699") +
  geom_hline(yintercept = 7500) +
  labs(
    title = "Daily steps per weekday",
    x = "Weekday",
    y = "Daily Steps"
  ) +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 1))
This shows that participants are most active on Saturdays and least active on Sundays.
# Bar chart of mean minutes asleep per weekday, with a horizontal reference
# line at 480 minutes (8 hours).
ggplot(day_steps_sleep) +
  geom_col(aes(x = weekday, y = daily_sleep), fill = "#85e0e0") +
  geom_hline(yintercept = 480) +
  labs(
    title = "Minutes asleep per weekday",
    x = "Weekday",
    y = "Sleep"
  ) +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 1))
People sleep more on Wednesdays and Sundays than on any other day of the week. However, the difference is small.
# Number of combined activity-and-sleep records per weekday.
# `weekday` is a character column, so without explicit levels the axis
# would be alphabetical; the factor puts the bars in Monday-to-Sunday order.
# The level names assume weekdays() returned English day names.
ggplot(
  data = combined_data,
  aes(
    x = factor(
      weekday,
      levels = c(
        "Monday", "Tuesday", "Wednesday", "Thursday",
        "Friday", "Saturday", "Sunday"
      )
    )
  )
) +
  geom_bar(fill = "green") +
  labs(title = "Tracker usage across the week", x = "Weekday")
More tracking is done on Thursdays, Tuesdays and Wednesdays.
# Total logged activity distance per weekday (bars stack the daily values).
# `weekday` is a character column, so explicit factor levels are used to
# show the days Monday-to-Sunday instead of alphabetically. The x label is
# set to "weekday" to keep the axis title the plot had before.
# The level names assume weekdays() returned English day names.
ggplot(
  data = combined_data,
  aes(
    x = factor(
      weekday,
      levels = c(
        "Monday", "Tuesday", "Wednesday", "Thursday",
        "Friday", "Saturday", "Sunday"
      )
    ),
    y = LoggedActivitiesDistance
  )
) +
  geom_bar(stat = "identity", fill = "green") +
  labs(
    title = "Logged Activity Distance by Day",
    x = "weekday",
    y = "Logged Activity Distance"
  )
There are many blanks in the data, and no records were available for Saturday and Sunday. The highest logged distance was on Mondays.
# Split the POSIXct `date_time` column into character `date` and `time`
# columns, then re-create `date_time` as a Date parsed from `date`.
# The timestamp is first formatted to a fixed "YYYY-MM-DD HH:MM:SS" string:
# separate() otherwise relies on implicit character conversion, which on
# R >= 4.3 omits the time part for midnight values and would leave `time`
# as NA for every 00:00:00 row.
hourly_steps <- hourly_steps %>%
  mutate(date_time = format(date_time, "%Y-%m-%d %H:%M:%S")) %>%
  separate(date_time, into = c("date", "time"), sep = " ") %>%
  mutate(date_time = ymd(date))
head(hourly_steps)
## Id date time StepTotal date_time
## 1 1503960366 2016-04-12 00:00:00 373 2016-04-12
## 2 1503960366 2016-04-12 01:00:00 160 2016-04-12
## 3 1503960366 2016-04-12 02:00:00 151 2016-04-12
## 4 1503960366 2016-04-12 03:00:00 0 2016-04-12
## 5 1503960366 2016-04-12 04:00:00 0 2016-04-12
## 6 1503960366 2016-04-12 05:00:00 0 2016-04-12
# Mean step count for each hour of the day, with bars shaded from green
# (low average) to red (high average).
hourly_steps %>%
  group_by(time) %>%
  summarise(average_steps = mean(StepTotal)) %>%
  ggplot(aes(x = time, y = average_steps, fill = average_steps)) +
  geom_col() +
  scale_fill_gradient(low = "green", high = "red") +
  labs(title = "Hourly steps throughout the day", x = "", y = "") +
  theme(axis.text.x = element_text(angle = 90))
1. The hourly step distribution shows that people are most active between 5pm and 10pm; this could be due to walking after work hours.
2. They are also highly active from 12pm to 2pm, which could be due to a break from work.
3. Bellabeat can create a reminder feature in its app that prompts users at free times (lunch break and the end of the workday) to increase their movement.
# Scatter plot of daily steps against calories with a smoothed trend line.
# Only geom_point() is used: layering geom_jitter() as well drew every
# observation twice, once at its true position and once randomly displaced.
ggplot(data = combined_data, aes(x = TotalSteps, y = Calories)) +
  geom_point() +
  geom_smooth(color = "red") +
  labs(
    title = "Total Daily steps vs Calories",
    x = "Daily steps",
    y = "Calories"
  )
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
This shows that there is a positive correlation between Total Steps and Calories. The more steps taken, the more calories burnt.
# Scatter plot of daily steps against minutes asleep with a smoothed trend
# line. Only geom_point() is used: layering geom_jitter() as well drew every
# observation twice, once at its true position and once randomly displaced.
ggplot(data = combined_data, aes(x = TotalSteps, y = TotalMinutesAsleep)) +
  geom_point() +
  geom_smooth(color = "red") +
  labs(
    title = "Daily steps vs Minutes Asleep",
    x = "Daily steps",
    y = "Minutes Asleep"
  )
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
There is no correlation between daily activity level (based on steps) and the number of minutes users sleep per day.
# Scatter plot of minutes asleep against sedentary minutes with a smoothed
# trend line. Only the dark-blue geom_point() layer is kept: the extra
# geom_jitter() layer drew every observation a second time at a randomly
# displaced position underneath it.
ggplot(data = combined_data, aes(x = TotalMinutesAsleep, y = SedentaryMinutes)) +
  geom_point(color = "darkblue") +
  geom_smooth(color = "red") +
  labs(title = "Sleep Time vs Sedentary Minutes")
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
1. There is a negative correlation between sedentary minutes and sleep time.
2. To act on this relationship, the Bellabeat app could recommend that its users reduce their sedentary time so that they can sleep better.
After going through the 7 analysis steps, and despite the limitations of the dataset (the data comes from Fitbit rather than Bellabeat, the sample size is small, and it covers only a limited period of time), we can recommend the following for Bellabeat's marketing strategy:
More accurate data from Bellabeat users should be collected so that a more accurate analysis could be generated.
Users spend 81.3% of their tracked minutes sedentary. To reduce this and get more users fairly or lightly active, a notification feature that reminds users to move at specific times could be added to the Bellabeat app.
Bellabeat can create group activities or content that encourages people to be more active on low-step, high-sedentary days such as Sundays.
Provide app notification for users to remind them to get sufficient sleep every day and implement new sleep measurement features or products such as tracking sleep time and Rapid Eye Movement (REM) sleep.
Bellabeat can consider setting daily/weekly calorie challenges and awarding points to the top performers; the points can be accumulated and redeemed as a discount on their next product purchase.
Bellabeat can suggest some ideas for low calorie breakfast, lunch, and dinner foods to help users that want to lose weight.
Bellabeat's marketing team can create campaigns, educational content, exercise suggestions and trackers on the Bellabeat app, and host physical fitness events and activities.
The App should have a good user experience and should be seamless to use.